Basic scMultiome RNA-seq data analysis

Vignette for Seurat: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html

Overview

  1. Seurat objects
    a) Sample statistics
    b) Cell Statistics
  2. Filtering
    a) Cell filtering by mitochondrial gene expression
    b) Cell filtering by number of UMI counts
  3. Doublet detection
    a) Scrublet
    b) DoubletFinder
    c) Doublet removal
  4. Preprocessing
    a) Log normalization and scaling
    b) SCTransform
  5. Dimensionality reduction and single-cell embedding
    a) Identification of highly variable features (feature selection)
    b) PCA
    c) Clustering
    d) Non-linear dimension reduction
  6. Cell cycle correction
    a) Cell cycle scoring
    b) Cell cycle correction
    c) Single-cell embedding of cell cycle corrected objects

Input data

  1. 10x Genomics Multiome-scRNA-seq of PBMCs
    PBMC_Multiome_10xTn5_CellRangerARC/raw_feature_bc_matrix
  2. 10x Genomics Multiome-scRNA-seq in combination with scTurboATAC-seq using Tn5hc 50 % (1:6 concentration) of PBMCs
    PBMC_Multiome_Tn5hc_50perc_CellRangerARC/raw_feature_bc_matrix
  3. 10x Genomics Multiome-scRNA-seq in combination with scTurboATAC-seq using Tn5hc (1:3 concentration) of PBMCs
    PBMC_Multiome_Tn5hc_CellRangerARC/raw_feature_bc_matrix

Notebook

20230912_PBMC_Multiome_scRNA_UMAP.ipynb


0. Load libraries, scripts and data

### Libraries
library(qs)
library(dplyr)
library(Seurat)
library(patchwork)
library(ggpubr)

Sys.setenv(RETICULATE_PYTHON = "miniconda3/envs/Scrublet_py3/bin/python3")
library(reticulate)

library(DoubletFinder)
##### Settings for regression during scaling of data
## BLAS Control threads:
library(RhpcBLASctl)
blas_set_num_threads(1)
omp_set_num_threads(1)
#optional
library(future)
plan("multicore",workers=9)
options(future.globals.maxSize= 5000*1024^2)
### Samples
samples <- c("PBMC_scATAC", "PBMC_scTurboATAC_16", "PBMC_scTurboATAC_13")

### Sample palette
sample_palette <- c("grey", "blue", "darkblue")
names(sample_palette) <- samples
### Data
inputFiles <- c("PBMC_Multiome_10xTn5_CellRangerARC/raw_feature_bc_matrix",
                "PBMC_Multiome_Tn5hc_50perc_CellRangerARC/raw_feature_bc_matrix",
                "PBMC_Multiome_Tn5hc_CellRangerARC/raw_feature_bc_matrix")
names(inputFiles) <- samples

1. Seurat objects

overview
input_data <- list()
for (sample in samples){
    input_data[[sample]] <- Read10X(data.dir = inputFiles[sample])
}

Default filtering criteria for cells:

  • nFeatures > 100
proj_list <- list()
for (sample in samples){
    proj_list[[sample]] <- CreateSeuratObject(counts = input_data[[sample]][["Gene Expression"]], project = sample, min.cells = 3, min.features = 100)
    proj_list[[sample]]$orig.ident <- sample
}

a) Sample statistics

for (sample in samples){
    proj_list[[sample]][["percent.mt"]] <- PercentageFeatureSet(proj_list[[sample]], pattern = "^MT-")
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$nFeature_RNA)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = log10(x$nCount_RNA))}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + ylim(2, NA) +
        ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$percent.mt)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)

b) Cell Statistics

plot_list <- list()

for (sample in samples){
    plot_list[[paste0(sample, "_nCount_percentMT")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt", 
                                                                       cols = sample_palette, group.by = "orig.ident") + 
                                                            theme(legend.position="none") + scale_x_log10(labels = scales::comma) +
                                                            ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                                            ylim(c(0,80))
    plot_list[[paste0(sample, "_nCount_nFeature")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",  
                                                                      cols = sample_palette, group.by = "orig.ident") + 
                                                            theme(legend.position="none") + ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                                            ylim(c(0,10000))
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist=plot_list, ncol=2, nrow=length(samples))

2. Filtering

overview
summary <- lapply(samples, 
                       function(sample){c(sample, '01_raw',
                                     nrow(proj_list[[sample]]@meta.data),
                                     median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
A matrix: 3 × 6 of type chr
samplestepnCellsnFeaturenCountpercent.mt
PBMC_scATAC 01_raw7365 135925337
PBMC_scTurboATAC_1601_raw10896112018417
PBMC_scTurboATAC_1301_raw10976106616438

a) Cell filtering by mitochondrial gene expression

Adapted filtering criteria for cells:

  • nFeatures > 100
  • percent.mt < 40
for (sample in samples){
    proj_list[[sample]] <- subset(proj_list[[sample]], subset = percent.mt < 40) 
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$nFeature_RNA)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = log10(x$nCount_RNA))}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + ylim(2, NA) +
        ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$percent.mt)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
        geom_hline(yintercept = c(40)) +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)
plot_list <- list()

for (sample in samples){
    plot_list[[paste0(sample, "_nCount_percentMT")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt", 
                                                                       cols = sample_palette, group.by = "orig.ident") + 
                                                            theme(legend.position="none") + scale_x_log10(labels = scales::comma) +
                                                            ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                                            ylim(c(0,80)) + geom_hline(yintercept = c(40))
    plot_list[[paste0(sample, "_nCount_nFeature")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",  
                                                                      cols = sample_palette, group.by = "orig.ident") + 
                                                            theme(legend.position="none") + ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                                            ylim(c(0,10000))
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist=plot_list, ncol=2, nrow=length(samples))
summary <- lapply(samples, 
                       function(sample){c(sample, '02A_percent.mt',
                                     nrow(proj_list[[sample]]@meta.data),
                                     median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .) %>% rbind(summary, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
A matrix: 6 × 6 of type chr
samplestepnCellsnFeaturenCountpercent.mt
PBMC_scATAC 01_raw 7365 135925337
PBMC_scTurboATAC_1601_raw 10896112018417
PBMC_scTurboATAC_1301_raw 10976106616438
PBMC_scATAC 02A_percent.mt7238 139226247
PBMC_scTurboATAC_1602A_percent.mt10764113018647
PBMC_scTurboATAC_1302A_percent.mt10882107416668

b) Cell filtering by number of UMI counts

Adapted filtering criteria for cells:

  • nFeature > 100
  • percent.mt < 40
  • nCount > 500
for (sample in samples){
    proj_list[[sample]] <- subset(proj_list[[sample]], subset = nCount_RNA  > 500) 
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$nFeature_RNA)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = log10(x$nCount_RNA))}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
        geom_hline(yintercept = c(log10(500))) + ylim(2, NA) +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$percent.mt)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
        geom_hline(yintercept = c(40)) +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)
plot_list <- list()

for (sample in samples){
    plot1 <- lapply(unique(proj_list[[sample]]$orig.ident), 
                    function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt", 
                                               cols = sample_palette, group.by = "orig.ident") + 
                                    theme(legend.position="none") + scale_x_log10() +
                                    ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                    ylim(c(0,80)) + geom_hline(yintercept = c(40)) + geom_vline(xintercept = c(500)) }) %>%
                                ggarrange(plotlist=., ncol=1, nrow=1)
    plot2 <- lapply(unique(proj_list[[sample]]$orig.ident), 
                    function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",  
                                               cols = sample_palette, group.by = "orig.ident") + 
                                theme(legend.position="none") + ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                ylim(c(0,10000)) + geom_vline(xintercept = c(500)) }) %>%
                ggarrange(plotlist=., ncol=1, nrow=1)
    plot_list[[sample]] <- ggarrange(plot1, plot2, ncol=1, nrow=2)
}
options(repr.plot.width=20, repr.plot.height=14)
ggarrange(plotlist=plot_list, ncol=3, nrow=1)
summary <- lapply(samples, 
                       function(sample){c(sample, '02B_percent.mt_nCount',
                                     nrow(proj_list[[sample]]@meta.data),
                                     median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .) %>% rbind(summary, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
A matrix: 9 × 6 of type chr
samplestepnCellsnFeaturenCountpercent.mt
PBMC_scATAC 01_raw 7365 135925337
PBMC_scTurboATAC_1601_raw 10896112018417
PBMC_scTurboATAC_1301_raw 10976106616438
PBMC_scATAC 02A_percent.mt 7238 139226247
PBMC_scTurboATAC_1602A_percent.mt 10764113018647
PBMC_scTurboATAC_1302A_percent.mt 10882107416668
PBMC_scATAC 02B_percent.mt_nCount4587 189141938
PBMC_scTurboATAC_1602B_percent.mt_nCount7916 131323026
PBMC_scTurboATAC_1302B_percent.mt_nCount7101 139324885

3. Doublet detection

overview

Low-dimensional embedding for evaluation of doublets

plot_list <- list()

for (sample in samples){
    proj_list[[sample]] <- NormalizeData(proj_list[[sample]], normalization.method = "LogNormalize", scale.factor = 10000)
    proj_list[[sample]] <- ScaleData(proj_list[[sample]], features = rownames(proj_list[[sample]]))
    
    proj_list[[sample]] <- FindVariableFeatures(proj_list[[sample]], selection.method = "vst", nfeatures = 2000)
    proj_list[[sample]] <- RunPCA(proj_list[[sample]], features = VariableFeatures(proj_list[[sample]]))
    plot_list[[sample]] <- ElbowPlot(proj_list[[sample]]) + ggtitle(sample)
}
options(repr.plot.width=5*length(samples), repr.plot.height=5)
ggpubr::ggarrange(plotlist = plot_list, ncol=length(samples))
PC_vector <- c(20, 20, 20)
names(PC_vector) <- samples
for (sample in samples){
    proj_list[[sample]] <- FindNeighbors(proj_list[[sample]], dims = 1:PC_vector[sample])
    proj_list[[sample]] <- RunUMAP(proj_list[[sample]], dims = 1:PC_vector[sample])
}

a) Scrublet

proj_list <- lapply(samples, function(sample){mat <- GetAssayData(object = proj_list[[sample]], assay = "RNA", slot = "counts");
                                 reticulate::use_condaenv("/home/bq_ilander/miniconda3/envs/Scrublet_py3/bin/python3");
                                 reticulate::import("os")
                                 Matrix::writeMM(mat, file="count_matrix")
                                 source_python("/media/ag-rippe2/isabelle/_PythonScripts/Scrublet.py")
                                 x <- as.numeric(as.data.frame(data.table::fread("count_matrix.doubletScores", sep = ",", header = F, skip = 1))[, 2])
                                 names(x) <- colnames(mat)
                                 file.remove("count_matrix")
                                 file.remove("count_matrix.doubletScores")
                                      
                                 proj_list[[sample]] <- AddMetaData(proj_list[[sample]], x, "Scrublet_Score")
                                 return(proj_list[[sample]])
})
names(proj_list) <- samples
plot_list <- list()
options(repr.plot.width=5*length(samples), repr.plot.height=6)
par(mfrow=c(1,length(samples)))
for (sample in samples){
    hist(proj_list[[sample]]$Scrublet_Score, breaks=50, main = sample)
    abline(v = c(0.15), col="red")
}
for (sample in samples){
    print(sample)
    table(proj_list[[sample]]$Scrublet_Score >= 0.15) %>% print(.)
    
    proj_list[[sample]]$Doublets <- proj_list[[sample]]$Scrublet_Score >= 0.15
}
[1] "PBMC_scATAC"

FALSE  TRUE 
 4277   310 
[1] "PBMC_scTurboATAC_16"

FALSE  TRUE 
 7389   527 
[1] "PBMC_scTurboATAC_13"

FALSE  TRUE 
 6529   572 
plot_list <- list()

for (sample in samples){
    plot_list[["Scrublet_Score"]][[sample]] <- FeaturePlot(proj_list[[sample]], "Scrublet_Score") + ggtitle(paste(sample, "- Scrublet Scores"))
    plot_list[["Doublets"]][[sample]] <- FeaturePlot(proj_list[[sample]], "Doublets") + ggtitle(paste(sample, "- Doublets (cutoff 0.15)"))
    
    plot_list[["Scatter_nCount"]][[sample]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "Scrublet_Score", group.by = "Doublets") + ggtitle(paste(sample, "- Doublets (0.15)"))
    plot_list[["Scatter_nFeature"]][[sample]] <- FeatureScatter(proj_list[[sample]], feature1 = "nFeature_RNA", feature2 = "Scrublet_Score", group.by = "Doublets") + ggtitle(paste(sample, "- Doublets (0.15)"))
    plot_list[["Scatter_percent.mt"]][[sample]] <- FeatureScatter(proj_list[[sample]], feature1 = "percent.mt", feature2 = "Scrublet_Score", group.by = "Doublets") + ggtitle(paste(sample, "- Doublets (0.15)"))
}
options(repr.plot.width=6*length(samples), repr.plot.height=5)
ggarrange(plotlist = plot_list[["Scrublet_Score"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Doublets"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Scatter_nCount"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Scatter_nFeature"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Scatter_percent.mt"]], ncol=length(samples))

b) DoubletFinder

DF_params <- list()

for (sample in samples){
    print(sample)
    y <- paramSweep_v3(proj_list[[sample]], PCs = 1:PC_vector[sample], sct = FALSE, num.cores = 10)
    y <- summarizeSweep(y, GT = FALSE)
    DF_params[[sample]] <- find.pK(y)
}
# Optimal pK for any scRNA-seq data can be manually discerned as maxima in BCmvn distributions.
pK_vector <- character()

for (sample in samples){
    pK_vector[sample] <- as.character(DF_params[[sample]]$pK[DF_params[[sample]]$BCmetric == max(DF_params[[sample]]$BCmetric)])
}

pK_vector <- as.numeric(pK_vector)
names(pK_vector) <- samples
for (sample in samples){
    proj_list[[sample]] <- doubletFinder_v3(proj_list[[sample]], PCs = 1:PC_vector[sample], pK = pK_vector[sample], 
                                            nExp = round(0.046*nrow(proj_list[[sample]]@meta.data)), reuse.pANN = FALSE, sct = FALSE)
}
plot_list <- list()

for (sample in samples){
    plot_list[["DoubletFinder"]][[sample]] <- DimPlot(proj_list[[sample]], 
                                                            group.by = grep("DF.classifications", colnames(proj_list[[sample]]@meta.data), value = TRUE),
                                                            cols = c("red", "grey")) + 
                                                        ggtitle(paste(sample, "- DoubletFinder"))
}
for (sample in samples){
    print(sample)
    print(table(proj_list[[sample]]@meta.data[, grep("DF.classifications", colnames(proj_list[[sample]]@meta.data), value = TRUE)]))
}
[1] "PBMC_scATAC"

Doublet Singlet 
    211    4376 
[1] "PBMC_scTurboATAC_16"

Doublet Singlet 
    364    7552 
[1] "PBMC_scTurboATAC_13"

Doublet Singlet 
    327    6774 
options(repr.plot.width=6*length(samples), repr.plot.height=5)
ggpubr::ggarrange(plotlist = plot_list[["DoubletFinder"]], ncol=length(samples))

c) Doublet removal

for (sample in samples){
    proj_list[[sample]] <- subset(proj_list[[sample]], subset = Doublets == FALSE) 
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$nFeature_RNA)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = log10(x$nCount_RNA))}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
        geom_hline(yintercept = c(log10(500))) + ylim(2, NA) +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples), 
                                                      y = x$percent.mt)}) %>% do.call(rbind, .), 
             aes(x = Sample, y = y)) + 
        geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
        geom_boxplot(alpha = 0.001) + 
        ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
        geom_hline(yintercept = c(40)) +
        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), 
                                axis.title = element_text(size=12), axis.text = element_text(size=12), 
                                axis.text.x=element_text(angle=45, vjust=1, hjust=1), 
                                legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)
plot_list <- list()

for (sample in samples){
    plot1 <- lapply(unique(proj_list[[sample]]$orig.ident), 
                    function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt", 
                                               cols = sample_palette, group.by = "orig.ident") + 
                                    theme(legend.position="none") + scale_x_log10() +
                                    ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                    ylim(c(0,80)) + geom_hline(yintercept = c(40)) + geom_vline(xintercept = c(500)) }) %>%
                                ggarrange(plotlist=., ncol=1, nrow=1)
    plot2 <- lapply(unique(proj_list[[sample]]$orig.ident), 
                    function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",  
                                               cols = sample_palette, group.by = "orig.ident") + 
                                theme(legend.position="none") + ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) + 
                                ylim(c(0,10000)) + geom_vline(xintercept = c(500)) }) %>%
                ggarrange(plotlist=., ncol=1, nrow=1)
    plot_list[[sample]] <- ggarrange(plot1, plot2, ncol=1, nrow=2)
}
options(repr.plot.width=20, repr.plot.height=14)
ggarrange(plotlist=plot_list, ncol=3, nrow=1)
summary <- lapply(samples, 
                       function(sample){c(sample, '03_percent.mt_nCount_doublets',
                                     nrow(proj_list[[sample]]@meta.data),
                                     median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
                                     median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .) %>% rbind(summary, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
A matrix: 12 × 6 of type chr
samplestepnCellsnFeaturenCountpercent.mt
PBMC_scATAC 01_raw 7365 135925337
PBMC_scTurboATAC_1601_raw 10896112018417
PBMC_scTurboATAC_1301_raw 10976106616438
PBMC_scATAC 02A_percent.mt 7238 139226247
PBMC_scTurboATAC_1602A_percent.mt 10764113018647
PBMC_scTurboATAC_1302A_percent.mt 10882107416668
PBMC_scATAC 02B_percent.mt_nCount 4587 189141938
PBMC_scTurboATAC_1602B_percent.mt_nCount 7916 131323026
PBMC_scTurboATAC_1302B_percent.mt_nCount 7101 139324885
PBMC_scATAC 03_percent.mt_nCount_doublets4277 185540688
PBMC_scTurboATAC_1603_percent.mt_nCount_doublets7389 129422596
PBMC_scTurboATAC_1303_percent.mt_nCount_doublets6529 136924275

4. Preprocessing

overview

a) Log normalization and scaling

for (sample in samples){
    proj_list[[sample]] <- NormalizeData(proj_list[[sample]], normalization.method = "LogNormalize", scale.factor = 10000)
    proj_list[[sample]] <- ScaleData(proj_list[[sample]], features = rownames(proj_list[[sample]]))
}

b) SCTransform

for (sample in samples){
    proj_list[[sample]] <- SCTransform(proj_list[[sample]])
}

5. Dimensionality reduction and single-cell embedding

overview

a) Identification of highly variable features (feature selection)

plot_list <- list()

for (sample in samples){
    proj_list[[sample]] <- FindVariableFeatures(proj_list[[sample]], assay = "RNA", selection.method = "vst", nfeatures = 2000)
    proj_list[[sample]] <- FindVariableFeatures(proj_list[[sample]], assay = "SCT", selection.method = "vst", nfeatures = 2000)

    # Identify the 10 most highly variable genes
    top10_RNA <- head(VariableFeatures(proj_list[[sample]], assay = "RNA"), 10)
    top10_SCT <- head(VariableFeatures(proj_list[[sample]], assay = "SCT"), 10)

    # plot variable features with and without labels
    plot_list[[paste0(sample, "_RNA")]] <- VariableFeaturePlot(proj_list[[sample]], assay = "RNA") %>% 
                                                    LabelPoints(plot = ., points = top10_RNA, repel = TRUE)
    plot_list[[paste0(sample, "_SCT")]] <- VariableFeaturePlot(proj_list[[sample]], assay = "SCT") %>% 
                                                    LabelPoints(plot = ., points = top10_SCT, repel = TRUE)
}
options(repr.plot.width=20, repr.plot.height=6*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))
Warning message:
“Transformation introduced infinite values in continuous x-axis”
Warning message:
“Transformation introduced infinite values in continuous x-axis”
Warning message:
“Transformation introduced infinite values in continuous x-axis”
Warning message:
“ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
Warning message:
“ggrepel: 4 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
Warning message:
“ggrepel: 8 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
Warning message:
“ggrepel: 6 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
Warning message:
“ggrepel: 8 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
Warning message:
“ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps”

b) PCA

plot_list <- list()

for (sample in samples){
    proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "RNA", reduction.name = "pca_RNA", 
                                  features = VariableFeatures(proj_list[[sample]], assay = "RNA"))
    proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "SCT", reduction.name = "pca_SCT", 
                                  features = VariableFeatures(proj_list[[sample]], assay = "SCT"))
    plot_list[[paste0(sample, "_RNA")]] <- ElbowPlot(proj_list[[sample]], reduction = "pca_RNA", ndims = 50) + ggtitle(sample)
    plot_list[[paste0(sample, "_SCT")]] <- ElbowPlot(proj_list[[sample]], reduction = "pca_SCT", ndims = 50) + ggtitle(sample)
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))
PCs_list <- rep(20, 6)
names(PCs_list) <- paste0(rep(samples, each=2), c("_RNA", "_SCT"))

c) Clustering

for (sample in samples){
    proj_list[[sample]] <- FindNeighbors(proj_list[[sample]], assay = "RNA", reduction = "pca_RNA", 
                                         dims = 1:PCs_list[[paste0(sample, "_RNA")]], graph.name = "RNA_snn")
    proj_list[[sample]] <- FindNeighbors(proj_list[[sample]], assay = "SCT", reduction = "pca_SCT", 
                                         dims = 1:PCs_list[[paste0(sample, "_SCT")]], graph.name = "SCT_snn")
    
    for (resolution in c(0.2, 0.5, 0.8)){
        proj_list[[sample]] <- FindClusters(proj_list[[sample]], resolution = resolution, graph.name = "RNA_snn")
        proj_list[[sample]] <- FindClusters(proj_list[[sample]], resolution = resolution, graph.name = "SCT_snn")
    }
}

d) Non-linear dimension reduction

for (sample in samples){
    proj_list[[sample]] <- RunUMAP(proj_list[[sample]], assay = "RNA", reduction = "pca_RNA", 
                                   dims = 1:PCs_list[[paste0(sample, "_RNA")]], reduction.name = "umap_RNA")
    proj_list[[sample]] <- RunUMAP(proj_list[[sample]], assay = "SCT", reduction = "pca_SCT", 
                                   dims = 1:PCs_list[[paste0(sample, "_SCT")]], reduction.name = "umap_SCT")
}
plot_list <- list()

for (sample in samples){
    plot_list[[sample]] <- list()
    
    plot_list[[sample]]$clusters0.2 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.2")
    plot_list[[sample]]$clusters0.5 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.5")
    plot_list[[sample]]$clusters0.8 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.8")
    
    plot_list[[sample]]$nCount <- FeaturePlot(proj_list[[sample]], reduction = "umap_RNA", features = "nCount_RNA")
    plot_list[[sample]]$nFeature <- FeaturePlot(proj_list[[sample]], reduction = "umap_RNA", features = "nFeature_RNA")
    plot_list[[sample]]$percentMT <- FeaturePlot(proj_list[[sample]], reduction = "umap_RNA", features = "percent.mt")

}
options(repr.plot.width=20, repr.plot.height=12)
for (sample in samples){
    ggarrange(plotlist = plot_list[[sample]], ncol = 3, nrow = 2, labels = sample) %>% print(.)
}
plot_list <- list()

for (sample in samples){
    plot_list[[sample]] <- list()
    
    plot_list[[sample]]$clusters0.2 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.2")
    plot_list[[sample]]$clusters0.5 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.5")
    plot_list[[sample]]$clusters0.8 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.8")
    
    plot_list[[sample]]$nCount <- FeaturePlot(proj_list[[sample]], reduction = "umap_SCT", features = "nCount_RNA")
    plot_list[[sample]]$nFeature <- FeaturePlot(proj_list[[sample]], reduction = "umap_SCT", features = "nFeature_RNA")
    plot_list[[sample]]$percentMT <- FeaturePlot(proj_list[[sample]], reduction = "umap_SCT", features = "percent.mt")

}
options(repr.plot.width=20, repr.plot.height=12)
for (sample in samples){
    ggarrange(plotlist = plot_list[[sample]], ncol = 3, nrow = 2, labels = sample) %>% print(.)
}

6. Cell cycle correction

overview

a) Cell cycle scoring

# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.  We can
# segregate this list into markers of G2/M phase and markers of S phase
# Filter out genes that are not in Seurat object
s.genes_list <- list()
g2m.genes_list <- list()

for (sample in samples){
    s.genes_list[[paste0(sample, "_RNA")]] <- cc.genes$s.genes[cc.genes$s.genes %in% rownames(proj_list[[sample]]@assays$RNA@meta.features)]
    g2m.genes_list[[paste0(sample, "_RNA")]] <- cc.genes$g2m.genes[cc.genes$g2m.genes %in% rownames(proj_list[[sample]]@assays$RNA@meta.features)]

    s.genes_list[[paste0(sample, "_SCT")]] <- cc.genes$s.genes[cc.genes$s.genes %in% rownames(proj_list[[sample]]@assays$SCT@meta.features)]
    g2m.genes_list[[paste0(sample, "_SCT")]] <- cc.genes$g2m.genes[cc.genes$g2m.genes %in% rownames(proj_list[[sample]]@assays$SCT@meta.features)]

}
for (sample in samples){
    proj_list[[sample]] <- CellCycleScoring(proj_list[[sample]], assay = "RNA", 
                                            s.features = s.genes_list[[paste0(sample, "_RNA")]], 
                                            g2m.features = g2m.genes_list[[paste0(sample, "_RNA")]], set.ident = TRUE)
    colnames(proj_list[[sample]]@meta.data)[grep("^S.Score$|^G2M.Score$|^Phase$", 
                                                 colnames(proj_list[[sample]]@meta.data))] <- paste0(c("S.Score", "G2M.Score", "Phase"), 
                                                                                                     "_RNA")
    
    proj_list[[sample]] <- CellCycleScoring(proj_list[[sample]], assay = "SCT", 
                                            s.features = s.genes_list[[paste0(sample, "_SCT")]], 
                                            g2m.features = g2m.genes_list[[paste0(sample, "_SCT")]], set.ident = TRUE)
    colnames(proj_list[[sample]]@meta.data)[grep("^S.Score$|^G2M.Score$|^Phase$", 
                                                 colnames(proj_list[[sample]]@meta.data))] <- paste0(c("S.Score", "G2M.Score", "Phase"), 
                                                                                                     "_SCT")

    proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "RNA", 
                                  features = c(s.genes_list[[paste0(sample, "_RNA")]], g2m.genes_list[[paste0(sample, "_RNA")]]), 
                                  reduction.name = "pca_RNA_s.g2m.genes")
    proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "SCT", 
                                  features = c(s.genes_list[[paste0(sample, "_SCT")]], g2m.genes_list[[paste0(sample, "_SCT")]]), 
                                  reduction.name = "pca_SCT_s.g2m.genes")
}
for (sample in samples){
    print(sample)
    print("RNA")
    table(proj_list[[sample]]$Phase_RNA) %>% print(.)

    print("SCT")
    table(proj_list[[sample]]$Phase_SCT) %>% print(.)
}
[1] "PBMC_scATAC"
[1] "RNA"

  G1  G2M    S 
1333 1527 1417 
[1] "SCT"

  G1  G2M    S 
1443 1422 1412 
[1] "PBMC_scTurboATAC_16"
[1] "RNA"

  G1  G2M    S 
2737 2542 2110 
[1] "SCT"

  G1  G2M    S 
2482 2379 2528 
[1] "PBMC_scTurboATAC_13"
[1] "RNA"

  G1  G2M    S 
2398 2273 1858 
[1] "SCT"

  G1  G2M    S 
2205 1950 2374 
plot_list <- list()

for (sample in samples){
    p1 <- DimPlot(proj_list[[sample]], reduction = "pca_RNA_s.g2m.genes") + ggtitle(sample)
    p2 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by ="Phase_RNA")
    p3 <- DimPlot(proj_list[[sample]], reduction = "pca_SCT_s.g2m.genes")
    p4 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "Phase_SCT")
    
    plot_list[[sample]] <- ggarrange(p1, p2, p3, p4, ncol=4)
}
options(repr.plot.width=20, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 1, nrow = length(samples))

b) Cell cycle correction

projCC_list <- list()

for (sample in samples){
    projCC_list[[sample]] <- ScaleData(proj_list[[sample]], assay = "RNA",
                                       vars.to.regress = c("S.Score_RNA", "G2M.Score_RNA"), 
                                       features = rownames(proj_list[[sample]]))
    projCC_list[[sample]] <- ScaleData(projCC_list[[sample]], assay = "SCT",
                                       vars.to.regress = c("S.Score_SCT", "G2M.Score_SCT"), 
                                       features = rownames(projCC_list[[sample]]))
}
plot_list <- list()

for (sample in samples){
    projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "RNA", 
                                    features = c(s.genes_list[[paste0(sample, "_RNA")]], g2m.genes_list[[paste0(sample, "_RNA")]]), 
                                    reduction.name = "pca_RNA_s.g2m.genes")
    projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "SCT", 
                                    features = c(s.genes_list[[paste0(sample, "_SCT")]], g2m.genes_list[[paste0(sample, "_SCT")]]), 
                                    reduction.name = "pca_SCT_s.g2m.genes")
    
    plot_list[[paste0(sample, "_RNA")]] <- DimPlot(projCC_list[[sample]], reduction = "pca_RNA_s.g2m.genes", group.by = "Phase_RNA")
    plot_list[[paste0(sample, "_SCT")]] <- DimPlot(projCC_list[[sample]], reduction = "pca_SCT_s.g2m.genes", group.by = "Phase_SCT")    
}
options(repr.plot.width=10, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))

c) Single-cell embedding of cell cycle corrected objects

plot_list <- list()

for (sample in samples){
    projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "RNA", reduction.name = "pca_RNA", 
                                  features = VariableFeatures(projCC_list[[sample]], assay = "RNA"))
    projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "SCT", reduction.name = "pca_SCT", 
                                  features = VariableFeatures(projCC_list[[sample]], assay = "SCT"))
    plot_list[[paste0(sample, "_RNA")]] <- ElbowPlot(projCC_list[[sample]], reduction = "pca_RNA", ndims = 50) + ggtitle(sample)
    plot_list[[paste0(sample, "_SCT")]] <- ElbowPlot(projCC_list[[sample]], reduction = "pca_SCT", ndims = 50) + ggtitle(sample)
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))
PCs_list <- rep(25, 6)
names(PCs_list) <- paste0(rep(samples, each=2), c("_RNA", "_SCT"))
for (sample in samples){
    projCC_list[[sample]] <- FindNeighbors(projCC_list[[sample]], assay = "RNA", reduction = "pca_RNA", 
                                         dims = 1:PCs_list[[paste0(sample, "_RNA")]], graph.name = "RNA_snn")
    projCC_list[[sample]] <- FindNeighbors(projCC_list[[sample]], assay = "SCT", reduction = "pca_SCT", 
                                         dims = 1:PCs_list[[paste0(sample, "_SCT")]], graph.name = "SCT_snn")
    
    for (resolution in c(0.2, 0.5, 0.8)){
        projCC_list[[sample]] <- FindClusters(projCC_list[[sample]], resolution = resolution, graph.name = "RNA_snn")
        projCC_list[[sample]] <- FindClusters(projCC_list[[sample]], resolution = resolution, graph.name = "SCT_snn")
    }
}
for (sample in samples){
    projCC_list[[sample]] <- RunUMAP(projCC_list[[sample]], assay = "RNA", reduction = "pca_RNA", 
                                   dims = 1:PCs_list[[paste0(sample, "_RNA")]], reduction.name = "umap_RNA")
    projCC_list[[sample]] <- RunUMAP(projCC_list[[sample]], assay = "SCT", reduction = "pca_SCT", 
                                   dims = 1:PCs_list[[paste0(sample, "_SCT")]], reduction.name = "umap_SCT")
}
plot_list <- list()

for (sample in samples){
    plot_list[[sample]] <- list()
    
    plot_list[[sample]]$clusters0.2 <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.2")
    plot_list[[sample]]$clusters0.5 <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.5")
    plot_list[[sample]]$clusters0.8 <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.8")
    plot_list[[sample]]$Cellcycle <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "Phase_RNA")
    
    plot_list[[sample]]$nCount <- FeaturePlot(projCC_list[[sample]], reduction = "umap_RNA", features = "nCount_RNA")
    plot_list[[sample]]$nFeature <- FeaturePlot(projCC_list[[sample]], reduction = "umap_RNA", features = "nFeature_RNA")
    plot_list[[sample]]$percentMT <- FeaturePlot(projCC_list[[sample]], reduction = "umap_RNA", features = "percent.mt")
    
}
options(repr.plot.width=25, repr.plot.height=12)
for (sample in samples){
    ggarrange(plotlist = plot_list[[sample]], ncol = 4, nrow = 2, labels = sample) %>% print(.)
}
plot_list <- list()

for (sample in samples){
    plot_list[[sample]] <- list()
    
    plot_list[[sample]]$clusters0.2 <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.2")
    plot_list[[sample]]$clusters0.5 <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.5")
    plot_list[[sample]]$clusters0.8 <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.8")
    plot_list[[sample]]$Cellcycle <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "Phase_SCT")
    
    plot_list[[sample]]$nCount <- FeaturePlot(projCC_list[[sample]], reduction = "umap_SCT", features = "nCount_RNA")
    plot_list[[sample]]$nFeature <- FeaturePlot(projCC_list[[sample]], reduction = "umap_SCT", features = "nFeature_RNA")
    plot_list[[sample]]$percentMT <- FeaturePlot(projCC_list[[sample]], reduction = "umap_SCT", features = "percent.mt")

}
options(repr.plot.width=25, repr.plot.height=12)
for (sample in samples){
    ggarrange(plotlist = plot_list[[sample]], ncol = 4, nrow = 2, labels = sample) %>% print(.)
}

7. Save results

### Save Seurat objects
qsave(proj_list, 'SeuratObjects.qs')
qsave(projCC_list, 'SeuratObjects_CCcorrected.qs')
### Save Sample statistics
write.table(summary, 'Summary.csv', sep=',', col.names = T, row.names = F)
lapply(samples, function(sample){all(rownames(proj_list[[sample]]@meta.data) == rownames(proj_list[[sample]]@reductions$umap_RNA@cell.embeddings) &
                                rownames(proj_list[[sample]]@meta.data) == rownames(proj_list[[sample]]@reductions$umap_SCT@cell.embeddings) &
                                rownames(proj_list[[sample]]@meta.data) == rownames(projCC_list[[sample]]@meta.data) & 
                                rownames(proj_list[[sample]]@meta.data) == rownames(projCC_list[[sample]]@reductions$umap_RNA@cell.embeddings) &
                                rownames(proj_list[[sample]]@meta.data) == rownames(projCC_list[[sample]]@reductions$umap_SCT@cell.embeddings)) %>% print(.);
                            df <- cbind(proj_list[[sample]]@meta.data,
                                            proj_list[[sample]]@reductions$umap_RNA@cell.embeddings, 
                                            proj_list[[sample]]@reductions$umap_SCT@cell.embeddings,
                                            projCC_list[[sample]]@reductions$umap_RNA@cell.embeddings, 
                                            projCC_list[[sample]]@reductions$umap_SCT@cell.embeddings)
                            colnames(df) <- c(colnames(proj_list[[sample]]@meta.data),
                                              colnames(proj_list[[sample]]@reductions$umap_RNA@cell.embeddings), 
                                              colnames(proj_list[[sample]]@reductions$umap_SCT@cell.embeddings),
                                              paste0(colnames(projCC_list[[sample]]@reductions$umap_RNA@cell.embeddings), "_CCcorrected"), 
                                              paste0(colnames(projCC_list[[sample]]@reductions$umap_SCT@cell.embeddings), "_CCcorrected"))
                            write.csv(df, paste0("RawData_", sample, ".csv"))})
[1] TRUE
[1] TRUE
[1] TRUE
  1. NULL
  2. NULL
  3. NULL

Isabelle Seufert
Division of Chromatin Networks, DKFZ
12.09.2023

sessionInfo()
R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/bq_ilander/miniconda3/envs/RWireX_v0.2.02/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] ROCR_1.0-11         KernSmooth_2.23-20  fields_14.1        
 [4] viridis_0.6.3       viridisLite_0.4.2   spam_2.9-1         
 [7] future_1.29.0       RhpcBLASctl_0.23-42 DoubletFinder_2.0.3
[10] reticulate_1.26     ggpubr_0.4.0        ggplot2_3.4.2      
[13] patchwork_1.1.2     SeuratObject_4.1.3  Seurat_4.3.0       
[16] dplyr_1.0.10        qs_0.25.4          

loaded via a namespace (and not attached):
  [1] uuid_1.1-0             backports_1.4.1        systemfonts_1.0.4     
  [4] plyr_1.8.7             igraph_1.3.5           repr_1.1.4            
  [7] lazyeval_0.2.2         sp_1.5-1               splines_4.2.2         
 [10] RApiSerialize_0.1.2    listenv_0.8.0          scattermore_0.8       
 [13] digest_0.6.30          htmltools_0.5.3        fansi_1.0.4           
 [16] magrittr_2.0.3         tensor_1.5             cluster_2.1.4         
 [19] globals_0.16.1         RcppParallel_5.1.5     matrixStats_0.62.0    
 [22] R.utils_2.12.1         spatstat.sparse_3.0-0  colorspace_2.1-0      
 [25] ggrepel_0.9.2          textshaping_0.3.6      crayon_1.5.2          
 [28] jsonlite_1.8.3         progressr_0.11.0       spatstat.data_3.0-0   
 [31] stringfish_0.15.7      survival_3.4-0         zoo_1.8-11            
 [34] glue_1.6.2             polyclip_1.10-4        gtable_0.3.3          
 [37] leiden_0.4.3           car_3.1-1              future.apply_1.10.0   
 [40] maps_3.4.1             abind_1.4-5            scales_1.2.1          
 [43] DBI_1.1.3              rstatix_0.7.1          spatstat.random_3.1-3 
 [46] miniUI_0.1.1.1         Rcpp_1.0.9             xtable_1.8-4          
 [49] dotCall64_1.0-2        htmlwidgets_1.5.4      httr_1.4.4            
 [52] RColorBrewer_1.1-3     ellipsis_0.3.2         ica_1.0-3             
 [55] pkgconfig_2.0.3        R.methodsS3_1.8.2      farver_2.1.1          
 [58] uwot_0.1.14            deldir_1.0-6           utf8_1.2.3            
 [61] tidyselect_1.2.0       labeling_0.4.2         rlang_1.1.1           
 [64] reshape2_1.4.4         later_1.3.0            munsell_0.5.0         
 [67] tools_4.2.2            cli_3.6.1              generics_0.1.3        
 [70] broom_1.0.1            ggridges_0.5.4         evaluate_0.18         
 [73] stringr_1.4.1          fastmap_1.1.0          ragg_1.2.4            
 [76] goftest_1.2-3          fitdistrplus_1.1-8     purrr_0.3.5           
 [79] RANN_2.6.1             pbapply_1.7-0          nlme_3.1-160          
 [82] mime_0.12              R.oo_1.25.0            compiler_4.2.2        
 [85] plotly_4.10.1          png_0.1-7              ggsignif_0.6.4        
 [88] spatstat.utils_3.0-1   tibble_3.2.1           stringi_1.7.8         
 [91] lattice_0.20-45        IRdisplay_1.1          Matrix_1.5-3          
 [94] vctrs_0.6.2            pillar_1.9.0           lifecycle_1.0.3       
 [97] spatstat.geom_3.0-6    lmtest_0.9-40          RcppAnnoy_0.0.20      
[100] data.table_1.14.4      cowplot_1.1.1          irlba_2.3.5.1         
[103] httpuv_1.6.6           R6_2.5.1               promises_1.2.0.1      
[106] gridExtra_2.3          parallelly_1.32.1      codetools_0.2-18      
[109] MASS_7.3-58.1          assertthat_0.2.1       withr_2.5.0           
[112] sctransform_0.3.5      grid_4.2.2             IRkernel_1.3.1        
[115] tidyr_1.2.1            carData_3.0-5          Cairo_1.6-0           
[118] Rtsne_0.16             pbdZMQ_0.3-8           spatstat.explore_3.0-6
[121] shiny_1.7.3            base64enc_0.1-3